import duckdb
import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook"
#pio.renderers.default = "notebook"
duckdb.execute("SET enable_progress_bar = false")
default_layout = dict(
margin=dict(l=55, r=55, b=55, t=55),
font=dict(family="Times", size=15),
title_x=0.5,
paper_bgcolor="#f5f2f0",
plot_bgcolor="#f5f2f0",
xaxis=dict(tickcolor="black", linecolor="black", showgrid=True, gridcolor="darkgray", zerolinecolor="darkgray"),
yaxis=dict(
tickcolor="black",
linecolor="black",
showgrid=True,
gridcolor="darkgray",
zerolinecolor="darkgray",
rangemode="tozero",
),
)
map_layout = dict(
images=[dict(
source='../assets/background_map.png',
xref='x',
yref='y',
x=0,
y=180,
sizex=360,
sizey=180,
layer='below'
)],
xaxis=dict(showgrid=False, visible=False, range=[0, 360]),
yaxis=dict(showgrid=False, visible=False, scaleanchor='x', scaleratio=1, range=[0, 180]),
coloraxis=dict(
colorscale=[
[0, "rgba(255,255,255,0)"],
[0.00000001, "rgb(12,51,131)"],
[1/1000, "rgb(10,136,186)"],
[1/100, "rgb(242,211,56)"],
[1/10, "rgb(242,143,56)"],
[1, "rgb(217,30,30)"],
],
),
)
df_monthly_contributors = duckdb.sql("""
SELECT
CONCAT(year, '-', LPAD(CAST(month as VARCHAR), 2, '0')) as Months,
COUNT(DISTINCT user_name) as 'Contributors'
FROM '../data_enriched/year=*/month=*/*.parquet'
GROUP BY year, month
ORDER BY year, month
""").df()[:-1]
df_new_contributors_monthly = duckdb.sql("""
WITH user_first_appearance AS (
SELECT
user_name,
year,
month,
ROW_NUMBER() OVER (PARTITION BY user_name ORDER BY year, month) as rn
FROM (
SELECT DISTINCT user_name, year, month
FROM '../data_enriched/year=*/month=*/*.parquet'
)
),
first_appearances AS (
SELECT user_name, year, month
FROM user_first_appearance
WHERE rn = 1
),
monthly_new_contributors AS (
SELECT
CONCAT(year, '-', LPAD(CAST(month as VARCHAR), 2, '0')) as Months,
COUNT(DISTINCT user_name) as 'New Contributors'
FROM first_appearances
GROUP BY year, month
ORDER BY year, month
)
FROM monthly_new_contributors
""").df()[:-1]
df_edit_count_monthly = duckdb.sql("""
SELECT
CONCAT(year, '-', LPAD(CAST(month as VARCHAR), 2, '0')) as months,
CAST(SUM(edit_count) as BIGINT) as edits
FROM '../data_enriched/year=*/month=*/*.parquet'
GROUP BY year, month
ORDER BY year, month
""").df()[:-1]
df_changeset_count_monthly = duckdb.sql("""
SELECT
CONCAT(year, '-', LPAD(CAST(month as VARCHAR), 2, '0')) as months,
CAST(COUNT(*) AS INTEGER) as changesets
FROM '../data_enriched/year=*/month=*/*.parquet'
GROUP BY year, month
ORDER BY year, month
""").df()[:-1]
fig = go.Figure()
buttons = []
fig.add_trace(
go.Scatter(
x=df_monthly_contributors.iloc[:, 0], y=df_monthly_contributors.iloc[:, 1], name="Contributors", visible=True
)
)
fig.add_trace(
go.Scatter(
x=df_new_contributors_monthly.iloc[:, 0],
y=df_new_contributors_monthly.iloc[:, 1],
name="New Contributors",
visible=True,
)
)
buttons.append({
"label": "Contributors",
"args": [{"visible": [True, True, False, False]}, {"title.text": "Monthly Contributors"}],
"method": "update"
})
fig.add_trace(
go.Scatter(
x=df_edit_count_monthly.iloc[:, 0],
y=df_edit_count_monthly.iloc[:, 1],
name="Edits",
visible=False,
)
)
buttons.append({
"label": "Edits",
"args": [{"visible": [False, False, True, False]}, {"title.text": "Monthly Edits"}],
"method": "update"
})
fig.add_trace(
go.Scatter(
x=df_changeset_count_monthly.iloc[:, 0],
y=df_changeset_count_monthly.iloc[:, 1],
name="Changesets",
visible=False,
)
)
buttons.append({
"label": "Changesets",
"args": [{"visible": [False, False, False, True]}, {"title.text": "Monthly Changesets"}],
"method": "update"
})
fig.for_each_trace(lambda trace: trace.update(hovertemplate="%{x}<br>%{y:,} " + trace.name + "<extra></extra>"))
fig.update_layout(
title="Monthly Contributors",
updatemenus=[{"type": "buttons", "buttons": buttons}],
**default_layout,
)
fig.show()
fig = go.Figure()
buttons = []
fig.add_trace(
go.Scatter(
x=df_new_contributors_monthly.iloc[:, 0],
y=df_new_contributors_monthly.iloc[:, 1].cumsum(),
name="Contributors",
visible=True,
)
)
buttons.append({
"label": "Contributors",
"args": [{"visible": [True, False, False]}, {"title.text": "Aggregated Contributors"}],
"method": "update"
})
fig.add_trace(
go.Scatter(
x=df_edit_count_monthly.iloc[:, 0],
y=df_edit_count_monthly.iloc[:, 1].cumsum(),
name="Edits",
visible=False,
)
)
buttons.append({
"label": "Edits",
"args": [{"visible": [False, True, False]}, {"title.text": "Aggregated Edits"}],
"method": "update"
})
fig.add_trace(
go.Scatter(
x=df_changeset_count_monthly.iloc[:, 0],
y=df_changeset_count_monthly.iloc[:, 1].cumsum(),
name="Changesets",
visible=False,
)
)
buttons.append({
"label": "Changesets",
"args": [{"visible": [False, False, True]}, {"title.text": "Aggregated Changesets"}],
"method": "update"
})
fig.for_each_trace(lambda trace: trace.update(hovertemplate="%{x}<br>%{y:,} " + trace.name + "<extra></extra>"))
fig.update_layout(
title="Aggregated Contributors",
updatemenus=[{"type": "buttons", "buttons": buttons}],
**default_layout,
)
fig.show()
df = duckdb.sql("""
SELECT
mid_pos_x as x,
mid_pos_y as y,
SUM(edit_count) as z
FROM '../data_enriched/year=*/month=*/*.parquet'
WHERE mid_pos_x IS NOT NULL AND mid_pos_y IS NOT NULL
GROUP BY mid_pos_x, mid_pos_y
""").df()
fig = go.Figure(go.Histogram2d(
x=df['x'].values,
y=df['y'].values,
z=df['z'].values,
histfunc='sum',
xbins=dict(start=0, end=360, size=1),
ybins=dict(start=0, end=180, size=1),
coloraxis='coloraxis'
))
fig.update_layout(title="Edit Count", **{**default_layout, **map_layout})
fig.show()
This shows the contribution patterns of users grouped by their first edit period (2-year periods), displaying the percentage of total yearly edits from each cohort.
sql_query = """
WITH user_first_edit AS (
-- Find the first edit year for each user
SELECT
user_name,
MIN(year) as first_edit_year
FROM
read_parquet('../data_enriched/year=*/month=*/*.parquet')
GROUP BY
user_name
),
year_user_edits AS (
-- Sum edits by year and user
SELECT
year,
user_name,
SUM(edit_count) as total_edits
FROM
read_parquet('../data_enriched/year=*/month=*/*.parquet')
GROUP BY
year, user_name
),
merged_data AS (
-- Join to get first edit year for each user's yearly edits
SELECT
y.year,
y.user_name,
y.total_edits,
u.first_edit_year,
CASE
-- Create 2-year periods based on odd/even years
WHEN u.first_edit_year % 2 = 1 THEN
CONCAT(CAST(u.first_edit_year AS VARCHAR), '-', CAST(u.first_edit_year + 1 AS VARCHAR))
ELSE
CONCAT(CAST(u.first_edit_year - 1 AS VARCHAR), '-', CAST(u.first_edit_year AS VARCHAR))
END as first_edit_period
FROM
year_user_edits y
JOIN
user_first_edit u ON y.user_name = u.user_name
)
SELECT
CAST(year AS VARCHAR) as years,
first_edit_period,
SUM(total_edits) as total_edits
FROM
merged_data
GROUP BY
year, first_edit_period
ORDER BY
year, first_edit_period
"""
df = duckdb.sql(sql_query).df()
yearly_totals = duckdb.sql("""
SELECT
CAST(year as VARCHAR) as years,
CAST(SUM(edit_count) as BIGINT) as edits
FROM '../data_enriched/year=*/month=*/*.parquet'
GROUP BY year
ORDER BY year
""").df()
df_pivoted = df.pivot(index="years", columns="first_edit_period", values="total_edits").fillna(0)
df_percent = df_pivoted.copy()
for year in df_percent.index:
year_total = yearly_totals[yearly_totals['years'] == year]['edits'].iloc[0]
df_percent.loc[year] = (df_percent.loc[year] / year_total * 100).round(2)
fig = go.Figure()
buttons = []
for period in df_pivoted.columns:
fig.add_trace(
go.Bar(
x=df_pivoted.index,
y=df_pivoted[period],
name=f"First edit in: {period}",
visible=True,
hovertemplate=f"First edit in: {period}" + "<br>%{y:,} edits<extra></extra>"
)
)
for period in df_percent.columns:
fig.add_trace(
go.Bar(
x=df_percent.index,
y=df_percent[period],
name=f"First edit in: {period}",
visible=False,
hovertemplate=f"First edit in: {period}" + "<br>%{y:.1f}% of total edits<extra></extra>"
)
)
buttons.append({
"label": "Absolute",
"args": [
{"visible": [True] * len(df_percent.columns) + [False] * len(df_percent.columns)},
{"title.text": "Contributor Attrition Rate by First Edit Period (absolute edits)",
"yaxis.title.text": "Number of Edits"}
],
"method": "update"
})
buttons.append({
"label": "Percentage",
"args": [
{"visible": [False] * len(df_percent.columns) + [True] * len(df_percent.columns)},
{"title.text": "Contributor Attrition Rate by First Edit Period (% of yearly edits)",
"yaxis.title.text": "Percentage of Total Edits (%)"}
],
"method": "update"
})
fig.update_layout(
title="Contributor Attrition Rate by First Edit Period",
yaxis_title="Number of Edits",
barmode='stack',
updatemenus=[{"type": "buttons", "buttons": buttons}],
**default_layout,
)
fig.show()
There is a big spike of new contributors in April 2016. This is because a lot of users using MAPS.ME were creating their first edits.
df = duckdb.sql("""
WITH user_first_appearance AS (
SELECT
user_name,
year,
month,
created_by,
ROW_NUMBER() OVER (PARTITION BY user_name ORDER BY year, month) as rn
FROM (
SELECT DISTINCT user_name, year, month, created_by
FROM '../data_enriched/year=*/month=*/*.parquet'
)
),
first_appearances AS (
SELECT user_name, year, month, created_by
FROM user_first_appearance
WHERE rn = 1
),
monthly_new_contributors AS (
SELECT
CONCAT(year, '-', LPAD(CAST(month as VARCHAR), 2, '0')) as Months,
COUNT(DISTINCT CASE WHEN created_by != 'MAPS.ME' OR created_by IS NULL THEN user_name END) as 'New Contributors without MAPS.ME',
COUNT(DISTINCT CASE WHEN created_by = 'MAPS.ME' THEN user_name END) as 'New Contributors only MAPS.ME'
FROM first_appearances
GROUP BY year, month
ORDER BY year, month
)
SELECT * FROM monthly_new_contributors
""").df()[:-1]
fig = go.Figure()
fig.add_trace(go.Scatter(x=df.iloc[:, 0], y=df.iloc[:, 1], name="New Contributors without MAPS.ME"))
fig.add_trace(go.Scatter(x=df.iloc[:, 0], y=df.iloc[:, 2], name="New Contributors only MAPS.ME"))
fig.for_each_trace(lambda trace: trace.update(hovertemplate="%{x}<br>%{y:,} " + trace.name + "<extra></extra>"))
fig.update_layout(title="Monthly New Contributors with and without MAPS.ME", **default_layout)
fig.show()
This shows the median number of edits made by contributors each year, providing insight into typical contributor activity levels.
df_median_edits_yearly = duckdb.sql("""
WITH yearly_contributor_edits AS (
SELECT
year,
user_name,
CAST(SUM(edit_count) as INTEGER) as user_edits
FROM '../data_enriched/year=*/month=*/*.parquet'
GROUP BY year, user_name
)
SELECT
CAST(year AS VARCHAR) as years,
CAST(PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY user_edits) as INTEGER) as "median number of edits per contributor"
FROM yearly_contributor_edits
GROUP BY year
ORDER BY year
""").df()
fig = go.Figure()
fig.add_trace(
go.Scatter(
x=df_median_edits_yearly.iloc[:, 0],
y=df_median_edits_yearly.iloc[:, 1],
mode='lines+markers',
name="Median edits per contributor",
line=dict(width=3),
marker=dict(size=8)
)
)
fig.update_traces(hovertemplate="%{x}<br>%{y:,} edits<extra></extra>")
fig.update_layout(
title="Median Number of Edits per Contributor",
xaxis_title="Year",
yaxis_title="Median Number of Edits",
**default_layout,
)
fig.show()